library("ggplot2")
library("gridExtra")
library("readr")
library("dplyr")
library("lubridate")
library("janitor")
library("tidyr")
library("tidyverse")
library("DataExplorer")
library("reshape2")
library("data.table")
library("DT")
library("d3heatmap")
library("tigerstats")
library("corrplot")
library("viridis")
library("plotly")
library("tm")
library("RColorBrewer")
library("leaflet")
library("wordcloud")
# Load the first crime extract. Empty cells and the common textual NA
# spellings are read as missing; strings stay character, not factor.
raw_crime <- read.csv(
  file = "E:/Masters/Sem 5/comp & visua analystics/b.csv",
  sep = ",",
  na.strings = c("", "NA", "na", "N/A", "n/a", "NaN", "nan"),
  strip.white = TRUE,
  stringsAsFactors = FALSE
)

# Cleaning the data: work on a copy whose column names have been normalised
# by janitor::clean_names(), leaving raw_crime untouched.
df <- clean_names(raw_crime)
# Summarise a data frame: in-memory size, dimensions and per-column
# uniqueness / missingness, and draw a DataExplorer missing-value plot.
# NOTE: a second `df_info` further down this script replaces this
# definition before it is ever called.
#
# x: a data frame; the expression it was passed as is reported as its name.
# Returns a list with three data frames:
#   data.frame     - name of the argument and its size in Mb
#   dimensions     - number of rows and columns
#   column.details - per column: distinct values, NA count, NA percentage
df_info <- function(x) {
  # deparse() + collapse gives exactly one label even when the argument is
  # an expression such as d[1:10, ] rather than a bare variable name.
  data <- paste(deparse(substitute(x)), collapse = " ")
  size <- format(object.size(x), units = "Mb")

  plot_missing(data.frame(x))

  missing_count <- colSums(is.na(x))
  column.info <- data.frame(
    column = names(x),
    # vapply() pins the result to one integer per column.
    unique.values = vapply(x, function(y) length(unique(y)), integer(1)),
    missing.count = missing_count,
    missing.pct = round(missing_count / nrow(x) * 100, 2)
  )
  # seq_len() stays valid for an empty summary, where 1:0 would be c(1, 0).
  row.names(column.info) <- seq_len(nrow(column.info))

  list(
    data.frame = data.frame(name = data, size = size),
    dimensions = data.frame(rows = nrow(x), columns = ncol(x)),
    column.details = column.info
  )
}
# Show the time zone of the current R session (recorded output below).
Sys.timezone()
## [1] "America/New_York"

# Load the second crime extract; empty cells and the common textual NA
# spellings are read as missing, and strings stay character.
df1 <- read.csv(
  file = "E:/Masters/Sem 5/comp & visua analystics/a.csv",
  sep = ",",
  na.strings = c("", "NA", "na", "N/A", "n/a", "NaN", "nan"),
  strip.white = TRUE,
  stringsAsFactors = FALSE
)

# df1 stays as read; df4 is the copy with janitor-normalised column names.
df4 <- clean_names(df1)
# Report basic facts about a data frame and draw its missing-value profile.
#
# x: a data frame; the expression it was passed as is reported as its name.
# Returns a list with three data frames:
#   data.frame     - name of the argument and its in-memory size in Mb
#   dimensions     - number of rows and columns
#   column.details - per column: distinct values, NA count, NA percentage
# Side effect: draws the DataExplorer missing-data plot.
df_info <- function(x) {
  # Data frame name. deparse() + collapse yields exactly one label even when
  # the argument is an expression such as d[1:10, ] rather than a bare name.
  data <- paste(deparse(substitute(x)), collapse = " ")
  # Size of the data frame in Mb.
  size <- format(object.size(x), units = "Mb")

  # Visualization of missing data. plot_missing() draws the plot itself, so
  # a theme added with `+` inside a function body is never displayed; pass
  # the theme through `ggtheme` instead.
  plot_missing(data.frame(x), ggtheme = theme_classic())

  # Column information.
  missing_count <- colSums(is.na(x))
  column.info <- data.frame(
    column = names(x),
    # vapply() pins the result to one integer per column.
    unique.values = vapply(x, function(y) length(unique(y)), integer(1)),
    missing.count = missing_count,
    missing.pct = round(missing_count / nrow(x) * 100, 2)
  )
  # seq_len() stays valid for an empty summary, where 1:0 would be c(1, 0).
  row.names(column.info) <- seq_len(nrow(column.info))

  list(
    data.frame = data.frame(name = data, size = size),
    dimensions = data.frame(rows = nrow(x), columns = ncol(x)),
    column.details = column.info
  )
}
Sys.timezone() # Displays the time zone of the current R session
## [1] "America/New_York"
# Summarise df4 (size, dimensions, per-column uniqueness and missingness);
# the recorded console output follows below.
df_info(df4)

## $data.frame
## name size
## 1 df4 82.1 Mb
##
## $dimensions
## rows columns
## 1 365291 17
##
## $column.details
## column unique.values missing.count missing.pct
## 1 incident_number 322293 0 0.00
## 2 offense_code 222 0 0.00
## 3 offense_code_group 67 0 0.00
## 4 offense_description 244 0 0.00
## 5 district 13 1993 0.55
## 6 reporting_area 880 23204 6.35
## 7 shooting 2 363813 99.60
## 8 occurred_on_date 265774 0 0.00
## 9 year 5 0 0.00
## 10 month 12 0 0.00
## 11 day_of_week 7 0 0.00
## 12 hour 24 0 0.00
## 13 ucr_part 5 99 0.03
## 14 street 4748 11536 3.16
## 15 lat 18484 23399 6.41
## 16 long 18484 23399 6.41
## 17 location 18499 0 0.00
# Break the day into four six-hour windows.
time_diff <- c("0", "6", "12", "18", "24")
# Assumes `hour` holds whole hours 0-23. The bins must be left-closed
# ([0, 6), [6, 12), [12, 18), [18, 24]): with cut()'s default right-closed
# bins, hour 6 lands in "00-06", 12 in "06-12" and 18 in "12-18", which
# makes the first window seven hours long and the last one five.
df$time_diff <- cut(
  df$hour,
  breaks = as.numeric(time_diff),
  labels = c("00-06", "06-12", "12-18", "18-24"),
  right = FALSE,
  include.lowest = TRUE
)
table(df$time_diff)
##
## 00-06 06-12 12-18 18-24
## 38643 74361 92128 58697
## (recorded with the earlier right-closed bins; counts shift with the fix)

# Creating the shift column: give each window a name. Rows whose time_diff
# is NA get an NA shift, because indexing the lookup with NA returns NA.
df <- df %>%
  mutate(
    shift = unname(c(
      "00-06" = "Late Night",
      "06-12" = "Morning",
      "12-18" = "Day",
      "18-24" = "Evening"
    )[as.character(time_diff)])
  )

# Share of incidents per shift.
x <- table(df$shift)
prop.table(x)
##
## Day Evening Late Night Morning
## 0.3491959 0.2224812 0.1464699 0.2818530
## (recorded with the earlier right-closed bins; shares shift with the fix)
# Interactive histogram of incident counts per offense group, coloured by
# shift.
plot_crime_offense_category <- plot_ly(
  df,
  x = ~offense_code_group,
  color = ~shift
) %>%
  add_histogram() %>%
  layout(
    title = "Total crime count distributed by hour",
    xaxis = list(title = "crime"),
    # yaxis must be a sibling of xaxis; nested inside the xaxis list it is
    # not a layout argument and the y-axis title is never applied.
    yaxis = list(title = "Count")
  )
plot_crime_offense_category
# Incidents from before 2018; which() also drops rows whose year is NA.
ca_crime_df <- df[which(as.numeric(df$year) < 2018), ]

# Horizontal bar chart of incident counts per district, largest at the top.
ca_crime_df %>%
  filter(!is.na(district)) %>%
  group_by(district) %>%
  # n() needs no na.rm; passing `na.rm = TRUE` to summarise() only created
  # a stray column named na.rm.
  summarise(count = n()) %>%
  ungroup() %>%
  arrange(desc(count)) %>%
  mutate(district = reorder(district, count)) %>%
  ggplot(aes(x = district, y = count)) +
  geom_bar(stat = "identity", color = "white", fill = "skyblue") +
  geom_text(
    aes(label = as.character(count)),
    hjust = 1, vjust = .5, size = 3, color = "black", fontface = "italic"
  ) +
  labs(
    x = "Neighborhood",
    y = "count",
    title = "Total crime in Each Neighborhood in 2016 & 2017 "
  ) +
  coord_flip() +
  theme_classic()

#df_2018 <- df[which(as.numeric(df$year) == 2018), ]
# Vertical bar chart of incident counts per district over the whole data
# set, with the count printed above each bar.
df %>%
  filter(!is.na(district)) %>%
  group_by(district) %>%
  # n() needs no na.rm; passing `na.rm = TRUE` to summarise() only created
  # a stray column named na.rm.
  summarise(count = n()) %>%
  ungroup() %>%
  arrange(desc(count)) %>%
  mutate(district = reorder(district, count)) %>%
  ggplot(aes(x = district, y = count)) +
  geom_bar(stat = "identity", color = "white", fill = "skyblue") +
  geom_text(
    aes(label = as.character(count)),
    hjust = .5, vjust = 0, size = 3, color = "black", fontface = "italic"
  ) +
  labs(
    x = "Neighborhood",
    y = "Count",
    title = "Total crime in each Neighboorhood for 2016 - 2018"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Horizontal bar chart of the ten most frequent offense groups.
df %>%
  filter(!is.na(offense_code_group)) %>%
  group_by(offense_code_group) %>%
  # n() needs no na.rm; passing `na.rm = TRUE` to summarise() only created
  # a stray column named na.rm.
  summarise(count = n()) %>%
  ungroup() %>%
  arrange(desc(count)) %>%
  mutate(offense_code_group = reorder(offense_code_group, count)) %>%
  # Rows are sorted by descending count, so the first ten are the top ten.
  head(10) %>%
  ggplot(aes(x = offense_code_group, y = count, fill = offense_code_group)) +
  geom_bar(stat = "identity", color = "white") +
  geom_text(
    aes(label = paste0(" ", count)),
    hjust = 1, vjust = .5, size = 4, color = "black", fontface = "italic"
  ) +
  labs(x = "Crime", y = "Count", title = "Top crime in Boston distibuted") +
  coord_flip() +
  theme_classic()

# length(unique(df$crime)) # We have 54 different types of crime which are reported.

# Incident counts per offense group for the pre-2018 subset.
y <- ca_crime_df %>%
  filter(!is.na(offense_code_group)) %>%
  group_by(offense_code_group) %>%
  # n() needs no na.rm; passing `na.rm = TRUE` to summarise() only created
  # a stray column named na.rm.
  summarise(count = n()) %>%
  ungroup() %>%
  arrange(desc(count)) %>%
  mutate(offense_code_group = reorder(offense_code_group, count))

# Incident counts per year over the whole data set.
z <- df %>%
  filter(!is.na(year)) %>%
  group_by(year) %>%
  summarise(count = n()) %>%
  ungroup() %>%
  arrange(desc(count)) %>%
  mutate(year = reorder(year, count))

# Horizontal bar chart of yearly totals. No colour aesthetic is mapped, so
# the former scale_color_gradient() had nothing to act on and is dropped.
ggplot(z, aes(x = year, y = count)) +
  geom_bar(stat = "identity", color = "blue", fill = "Lightblue", width = 0.25) +
  geom_text(
    aes(label = paste0(" ", count)),
    hjust = 1, vjust = .25, size = 4, color = "black", fontface = "italic"
  ) +
  # The x aesthetic is the year, so label it as such.
  labs(x = "Year", y = "Count", title = "Total crime in Boston from year 2016-2018 ") +
  coord_flip() +
  theme_classic()

# One line per district: number of incidents in each calendar month.
ggplot(subset(df, !is.na(district)), aes(x = month, color = district)) +
  geom_line(stat = "count") +
  scale_x_continuous(breaks = seq(1, 12, 1)) +
  labs(
    title = "No. Incidents by Neighborhood on Monthly Basis",
    # The x aesthetic is the month; districts are told apart by colour.
    x = "Month",
    y = "Number of Incidents"
  ) +
  theme_classic()

# Area chart of the number of incidents per hour of the day.
# geom_area() has no `width` parameter (it was ignored with a warning), and
# the stat is spelled in its canonical lower-case form.
ggplot(df, aes(x = hour)) +
  geom_area(stat = "count") +
  labs(
    title = "Reported Crime start time (in Hours)",
    x = "Hour (Format - 24Hrs)",
    y = "Number of Count"
  ) +
  theme_classic()

# Monthly incident counts per offense group (pre-2018 subset), ordered by
# month.
counts <- ca_crime_df %>%
  group_by(offense_code_group, month) %>%
  summarise(Counts = n()) %>%
  ungroup() %>%
  arrange(month)

# Wide layout: one row per month, one column per offense group.
# data.table is attached after reshape2 and masks dcast(); the input is a
# plain data frame, so call reshape2's version explicitly.
crime_plot <- reshape2::dcast(
  as.data.frame(counts),
  month ~ offense_code_group,
  value.var = "Counts"
)
# A group with no incidents in a month has a count of zero, not NA.
crime_plot[is.na(crime_plot)] <- 0
row.names(crime_plot) <- crime_plot$month # Make month row names
# Remove the month column; drop = FALSE keeps a data frame even when only
# one offense group is left.
crime_plot <- crime_plot[, -1, drop = FALSE]

# Correlation between offense groups across months.
crime_plot <- cor(crime_plot)
# corrplot() draws with base graphics, so a ggplot theme cannot be added to
# its result; the former `+ theme_classic()` only evaluated to NULL.
corrplot(
  crime_plot,
  type = "lower", order = "hclust", method = "color",
  tl.col = "black", tl.srt = 45, number.cex = 0.60, tl.cex = 0.50
)

# What are the top crimes in each district (pre-2018 subset)?
# After summarise() the result is still grouped by district, so top_n()
# keeps the highest-Total offense group within each district. The weight is
# named explicitly instead of relying on top_n() picking the last column.
district_by_crime <- ca_crime_df %>%
  group_by(district, offense_code_group) %>%
  dplyr::summarise(Total = n()) %>%
  arrange(desc(Total)) %>%
  top_n(n = 1, wt = Total)
head(district_by_crime, 10)
## # A tibble: 10 x 3
## # Groups: district [10]
## district offense_code_group Total
## <chr> <chr> <int>
## 1 South End Larceny 4113
## 2 Roxbury Motor Vehicle Accident Response 3956
## 3 Dorchester Motor Vehicle Accident Response 3250
## 4 Downtown Larceny 2621
## 5 Mattapan Motor Vehicle Accident Response 2382
## 6 Brighton Motor Vehicle Accident Response 1794
## 7 South Boston Motor Vehicle Accident Response 1633
## 8 Hyde Park Motor Vehicle Accident Response 1431
## 9 Jamaica Plain Motor Vehicle Accident Response 1326
## 10 West Roxbury Motor Vehicle Accident Response 1142
# Turn the per-district summary into a plain data frame with factor columns
# (2016-2017 data).
district_by_crime <- as.data.frame(district_by_crime)
district_by_crime[c("district", "offense_code_group")] <- lapply(
  district_by_crime[c("district", "offense_code_group")],
  factor
)

# Horizontal bars: one per district, ordered by Total and filled by the
# district's leading offense group, with the count printed on the bar.
ggplot(
  data = district_by_crime,
  mapping = aes(
    x = reorder(district, Total),
    y = Total,
    fill = offense_code_group
  )
) +
  geom_col() +
  geom_text(
    aes(x = district, label = paste0(" ", Total)),
    hjust = 1, vjust = .25, size = 4, color = "black", fontface = "italic"
  ) +
  labs(
    title = "Top Crime in each district in 2016-2017",
    x = "district",
    y = "Total Count"
  ) +
  scale_fill_discrete(name = "Offense Category") +
  coord_flip() +
  theme_classic()

# Same summary for year 2018: the leading offense group in each district.
# After summarise() the result is still grouped by district, so top_n()
# works per district. The weight is named explicitly instead of relying on
# top_n() picking the last column.
district_by_crime_2018 <- df %>%
  filter(year == 2018) %>%
  group_by(district, offense_code_group) %>%
  dplyr::summarise(Total = n()) %>%
  arrange(desc(Total)) %>%
  top_n(n = 1, wt = Total)
# Plain data frame with factor columns for the 2018 summary.
district_by_crime_2018 <- as.data.frame(district_by_crime_2018)
for (col_name in c("district", "offense_code_group")) {
  district_by_crime_2018[[col_name]] <- factor(district_by_crime_2018[[col_name]])
}
rm(col_name)

# Horizontal bars: one per district, ordered by Total and filled by the
# district's leading offense group, with the count printed on the bar.
ggplot(
  district_by_crime_2018,
  aes(x = reorder(district, Total), y = Total, fill = offense_code_group)
) +
  geom_col() +
  geom_text(
    mapping = aes(x = district, label = paste0(" ", Total)),
    hjust = 1,
    vjust = .25,
    size = 3,
    color = "black",
    fontface = "italic"
  ) +
  scale_fill_discrete(name = "Offense Category") +
  labs(
    title = "Top Crime in each district 2018",
    x = "District",
    y = "Total Count"
  ) +
  coord_flip() +
  theme_classic()

